import os
import json
import random

# --- Configuration -------------------------------------------------------
# Which dataset split to convert; used as the base name of both the input
# and the output file (``<split>.json``).
split = 'train'
data_path = 'G:/DataSet/prosocial-dialog/'
output_path = 'G:/DataSet/prosocial-dialog_instruction_cls/'

# Instruction phrasings; one is picked at random for every example.
ques = [
    "Is the following sentence harmful?",
    "Is there a moral issue with the speaker's words?",
    "Is it necessary to consider the safety of given utterance?",
    "Do you think the following sentence is harmful?",
    "Do you think this sentence is toxic?",
    "Do you think this sentence is inappropriate?",
    "Does this statement have a negative impact on society?",
]


def build_example(json_obj: dict, instruction: str = None) -> dict:
    """Turn one source record into an instruction/input/output record.

    Args:
        json_obj: Parsed source record. The keys read are ``context``,
            ``safety_label``, ``safety_annotation_reasons`` and, for the
            "possibly"/"probably" labels, ``safety_annotations``.
        instruction: Question to use as the instruction. When ``None`` a
            random entry of ``ques`` is chosen.

    Returns:
        A dict with the keys ``instruction``, ``input`` and ``output``
        (in that order).

    Raises:
        KeyError: If a required key is missing from ``json_obj``.
        ValueError: If the label is "possibly"/"probably needs caution" but
            no annotation equals ``'needs caution'``.
    """
    if instruction is None:
        instruction = random.choice(ques)

    label = json_obj['safety_label']
    reasons = json_obj['safety_annotation_reasons']

    if label == '__casual__':
        # Author's note: 3 "casual" in safety_annotations.
        answer, reason_idx = 'No. ', 0
    elif label == '__possibly_needs_caution__':
        # Author's note: 1 "needs caution" and 2 "casual".
        # Use the reason written by the first annotator who flagged it.
        answer = 'Possibly. '
        reason_idx = json_obj['safety_annotations'].index('needs caution')
    elif label == '__probably_needs_caution__':
        # Author's note: 3 "needs caution" and 1 "casual".
        # NOTE(review): those counts do not add up to three annotations;
        # confirm against the dataset description.
        answer = 'Probably. '
        reason_idx = json_obj['safety_annotations'].index('needs caution')
    else:
        # Intended for '__needs_caution__' and '__needs_intervention__';
        # any other (unrecognised) label also ends up here.
        answer, reason_idx = 'Yes. ', 0

    return {
        'instruction': instruction,
        'input': json_obj['context'],
        'output': answer + reasons[reason_idx],
    }


def convert_file(in_file: str, out_file: str) -> list:
    """Convert a JSON-lines file into a single JSON array of examples.

    Every non-blank line of ``in_file`` is parsed as one JSON object and
    passed through ``build_example``; the resulting list is written to
    ``out_file`` as one JSON document and also returned.

    Args:
        in_file: Path of the JSON-lines source file (read as UTF-8).
        out_file: Path of the JSON file to write (UTF-8, non-ASCII characters
            kept as-is).

    Returns:
        The list of converted examples, in input order.
    """
    examples = []
    with open(in_file, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of loading every line into memory first.
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            examples.append(build_example(json.loads(line)))

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(examples, f, ensure_ascii=False)
    return examples


def main() -> None:
    """Convert the configured split from ``data_path`` into ``output_path``."""
    os.makedirs(output_path, exist_ok=True)
    file_name = f'{split}.json'
    convert_file(os.path.join(data_path, file_name),
                 os.path.join(output_path, file_name))


if __name__ == '__main__':
    main()